__Author__ = "Peter Herman"
__Project__ = "DICL Dataset"
__Created__ = "August 13, 2024"
__Description__ = '''The script produces version 2 the DICL Dataset using the "raw" source data from Ethnologue.'''


import pandas as pd
import os
from math import comb
import numpy as np

# --------------------------------------------------------------------------
# This script constructs the DICL indices using source data from Ethnologue.
#
# Inputs:
#      The script draws on 3 inputs to construct the data:#
#           1. Ethnologue data describing the number of speakers of each language in each country (speaker_data.csv)
#           2. Ethnologue data describing the linguistic family trees of each language (language_trees.csv)
#           3. A comma separated text file of country names and ISO 3-digit codes (country_codes.dlm)
#
#      We are unable to provide inputs 1. and 2. but they are available from Ethnologue. Input 3. is included in
#      the repository with this code file.
#
# Outputs:
#      The script creates two versions of the DICL dataset in different formats:
#           1. A comma-separated text version (DICL_v2.csv)
#           2. A Stata binary version (DICL_v2.dta)
#
#      The script also produces a readme file automatically, which details the contents of the dataset (README_DICL.txt)
#
#      Finally, the script also produces multiple intermediary files (1 for each set of indices and a folder containing
#      sub-calculations for the linguistic proximity measures). These can be discarded after completion but offer
#      a means of running the script in multiple discrete stages, which may be helpful on less-powerful computers.
#
# Outline
#      1. Define and load inputs
#      2. Create common official language indices (COR and COL)
#      3. Create common native language index (CNL)
#      4. Create common acquired language index (CAL)
#      5. Create common spoken (native + acquired) language index (CSL)
#      6. Create language tree proximity measures
#      7. Create native linguistic proximity measures (LPN and BPN)
#      8. Create acquired linguistic proximity measures (LPA and BPA)
#      9. Create spoken linguistic proximity measures (LPS and BPS)
#     10. Combine indices and save to file, create README
# ---------------------------------------------------------------------------


# ***  MUST UPDATE TO RUN ***
# Set working directory, which should be the folder containing the input data and will be where outputs are saved
working_directory = ""
# ***************************


# -----
# 1. Define and load inputs
# -----

print("Step 1")

# Set suffix for output files
version = 'v2'
# Define folder to save constructed data to
save_loc = '{}\\indices_{}\\'.format(working_directory, version)
if not os.path.exists(save_loc):
    os.makedirs(save_loc)

# If True, recompute language tree information (time consuming). If False, use previously computed trees.
# Must be True when running for the first time
recompute_trees = True

# Determine final row number of dataset for quality validation
num_final_records = 2*comb(242, 2) + 242

# Load raw input data
ethnologue_raw_location = '{}\\speaker_data.csv'.format(working_directory)
language_tree_location = "{}\\language_trees.csv".format(working_directory)
country_names_loc = "{}\\country_codes.dlm".format(working_directory)


raw_ethnologue_data = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])

# Data is defined at the country-language level, covering each language spoken in each country. For example, the
# columns we make use of are structured as follows (actual values have been replaced with fake values to avoid disclosure):

# ISO_639  Country_Name  All_Users  L1_users  L2_users  Function_code  Sum_Of_Populations   [other columns omitted]
# aaa      Nigeria       45000        30000      15000                          150000000
# aab      Nigeria       25000        10000      15000            SPW           150000000
# aaa      Italy         10000         2000       8000            LRN            80000000


# List columns
print(raw_ethnologue_data.columns)
# Index(['ISO_639', 'Country_Code', 'Country_Name', 'Is_Primary',  'Is_Indigenous', 'Is_Established', 'All_Users',
#        'L1_Users', 'L2_Users', 'Function_Code', 'Family', 'Is_Written', 'Institutional', 'Developing',
#        'Vigorous', 'In_Trouble', 'Dying', 'Extinct', 'Country_Name.1', 'Indigenous', 'Established', 'Immigrant',
#        'Diversity', 'Included', 'Sum_Of_Populations', 'Population', 'Literacy_Rate', 'ISO3'],
#       dtype='object')

# Get list of countries in data
raw_countries = raw_ethnologue_data['ISO3'].unique().tolist()

# Check for missing language codes (There shouldn't be any but it requires special treatment of language "nan")
missing_lang = raw_ethnologue_data.loc[raw_ethnologue_data['ISO_639'].isna(),:]
if missing_lang.shape[0] >0:
    raise ValueError("Raw data has missing language values")

# ------
# 2. CREATE COMMON OFFICIAL LANGUAGE INDICES (COL and COR)
# ------
print("Step 2")
# Load fresh copy of raw data to work with
col_data = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])

# Get list of different types of official functions
all_functions = col_data['Function_Code'].unique()
# Drop if function code is nan (i.e. language does not have an official function in a particular country)
col_data.dropna(subset=['Function_Code'], inplace=True)

# ----
# 2.1. Create wide definition of official
# ----
# Create cartesian product of country-language pairs
col_data.loc[:, 'key_col'] = 1
expanded = (col_data.merge(col_data, on='key_col', suffixes=('', '_2'))
            .drop('key_col', axis=1)
            .reset_index(drop=True))
# Drop cases where languages are not the same
expanded.drop(expanded.loc[expanded['ISO_639'] != expanded['ISO_639_2']].index, inplace=True)
# Define official languages column
expanded['col'] = 1
# Group by country-pair and take max value (i.e. at least 1 common language)
col = expanded.groupby(['ISO3', 'ISO3_2'], sort=False, as_index=False)['col'].max()


# ----
# 2.2. Create Narrow definition
# ----
# Narrow down to certain official functions
official = ['SNL', 'SNW', 'DNL', 'DNW']
narrower_data = col_data.loc[col_data['Function_Code'].isin(official)]
# Create cartesian product
expanded_narrow = (narrower_data.merge(narrower_data, on='key_col', suffixes=('', '_2'))
                   .drop('key_col', axis=1)
                   .reset_index(drop=True))
# Drop language mismatches
expanded_narrow.drop(expanded_narrow.loc[expanded_narrow['ISO_639'] != expanded_narrow['ISO_639_2']].index, inplace=True)

# Define narrow official language and group at country-pair level
expanded_narrow['col_narrow'] = 1
col_narrow = expanded_narrow.groupby(['ISO3', 'ISO3_2'], sort=False, as_index=False)['col_narrow'].max()


# ----
# 2.3. Build to full panel dimensions
# -----

# Create all possible country pairs
full_panel = [(i,j) for i in raw_countries for j in raw_countries]
full_panel = pd.DataFrame(full_panel, columns = ['ISO3', 'ISO3_2'])
# Add broad col variable
full_panel = full_panel.merge(col, how = 'left', on = ['ISO3', 'ISO3_2'], validate = '1:1')
# Add narrow col variable
full_panel = full_panel.merge(col_narrow, how = 'left', on = ['ISO3', 'ISO3_2'], validate = '1:1')
# Fill in empty with zero, signifying no common official language
full_panel.fillna(0, inplace = True)

# Check shape of data
if full_panel.shape[0]!=num_final_records:
    raise ValueError("Dimensions of fine panel not as expected.")

# Save data series
full_panel.to_csv("{}\\col_{}.csv".format(save_loc, version), index=False)



# -----
# 3. Create Common Native Language (CNL)
# -----
print("Step 3")

# -----
# 3.1. Create Common Native Language indices (CNL)
# ----

# Load fresh copy of raw data to work with
lang_data = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])
# Construct native speakers (L1) as a share of population
lang_data['Speaker_Share'] = lang_data['L1_Users'] / lang_data['Sum_Of_Populations']
# Drop if no record of speakers of a language
lang_data.dropna(subset=['Speaker_Share'], inplace=True)

# Merge on the language code, creating all possible combinations of countries with a matching language
expanded = lang_data.merge(lang_data, on='ISO_639', suffixes=('', '_2'))

# Compute speaker product and sum to cnl measure
expanded['cnl'] = expanded['Speaker_Share'] * expanded['Speaker_Share_2']
cnl = (expanded.groupby(['ISO3', 'ISO3_2'], as_index=False).agg({'cnl':'sum'}))

# ---
# 3.2. Fill in zeros
# ---
def add_cnl_zeros(cnl_data):
    # Get list of countries
    lang_countries = cnl_data['ISO3'].unique().tolist()
    # Create all country pairs as dataframe
    lang_panel = [(i, j) for i in lang_countries for j in lang_countries]
    lang_panel = pd.DataFrame(lang_panel, columns = ['ISO3', 'ISO3_2'])
    # Merge non-zero CNL values onto full panel
    lang_panel = lang_panel.merge(cnl_data, how = 'outer', on = ['ISO3', 'ISO3_2'], validate = '1:1')
    # Fill remaining cels with zero
    lang_panel.fillna(0, inplace = True)
    return lang_panel

cnl = add_cnl_zeros(cnl)

# Save to file
cnl.to_csv(os.path.join(save_loc, 'cnl_{}.csv'.format(version)), index=False)



# ---------
# 4. Create Common Acquired Language Index (CAL)
# ---------
print("Step 4")
# -----
# 4.1. Compute CAL values
# ----
# Load fresh copy of raw data to work with
lang_data = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])

# Construct L2 speakers as a share of population
lang_data['Speaker_Share'] = lang_data['L2_Users'] / lang_data['Sum_Of_Populations']
# Drop if no record of speakers of a language
lang_data.dropna(subset=['Speaker_Share'], inplace=True)

# Drop if ISO-639 is missing
lang_data = lang_data.loc[~lang_data['ISO_639'].isna(),:]

# Expand to bilateral by matching all occurences of each language
expanded = lang_data.merge(lang_data, on='ISO_639', suffixes=('', '_2'))


# Compute speaker product and sum to cal measure
expanded['cal'] = expanded['Speaker_Share'] * expanded['Speaker_Share_2']
cal = (expanded.groupby(['ISO3', 'ISO3_2'], as_index=False).agg({'cal': 'sum'}))


# ---
# 4.2. Fill in zeros
# ---
# Unlike L1 speakers, some countries are missing any information about L2 speakers. We begin by removing these cases
# rather than treating them as being equal to zero.
raw_data_reloaded = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])

# ID countries with/without any L2 speaker data
l2_data = raw_data_reloaded.loc[~raw_data_reloaded['L2_Users'].isna(),:]
l2_countries = l2_data['ISO3'].unique().tolist()
non_l2_countries = [iso for iso in raw_countries if iso not in l2_countries]
# Add Afghanistan to missing list manually, which is missing L2 speakers for all but one language that has 0 L2 speakers
non_l2_countries.append('AFG')
l2_countries.remove('AFG')
# Compute expected number of CAL non-zero entries for validation purposes
l2_record_count = comb(len(l2_countries),2)*2 + len(l2_countries)

def add_cal_zeros(data):
    # Create sub panel among countries with L2 info so that zeros can be added to countries with no L2 ties
    cal_panel = [(i, j) for i in l2_countries for j in l2_countries]
    cal_panel = pd.DataFrame(cal_panel, columns = ['ISO3', 'ISO3_2'])
    cal_panel = cal_panel.merge(data, how = 'left', on = ['ISO3', 'ISO3_2'], validate = '1:1')
    cal_panel.fillna(0, inplace = True)

    # Create full panel of all countries and add data to it (does not fill in zeros this time)
    full_cal_panel = [(i, j) for i in raw_countries for j in raw_countries]
    full_cal_panel = pd.DataFrame(full_cal_panel, columns = ['ISO3', 'ISO3_2'])
    full_cal_panel = full_cal_panel.merge(cal_panel, how = 'left', on = ['ISO3', 'ISO3_2'], validate = '1:1')
    return full_cal_panel

cal = add_cal_zeros(cal)


# Check dimensions of entire panel and panel with non-zero entries
if not cal.shape[0] == num_final_records:
    raise ValueError("Final panel not of expected length.")
cal_no_missing = cal.loc[~cal['cal'].isna(),:]
if not cal_no_missing.shape[0] == l2_record_count:
    raise ValueError("Unexpected number of non-zero entries in panel not of expected length.")


# Save to file
cal.to_csv(os.path.join(save_loc, 'cal_{}.csv'.format(version)), index=False)



# ------
# 5. Create common spoken (native + acquired) language index (CSL)
# ------
print("Step 5")
# -----
# 5.1. Create CSL values
# ----

# Load fresh copy of source data
lang_data = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])

# Compare all users to L1 + L2
lang_data['user_diff'] = lang_data['All_Users'] - (lang_data['L1_Users'] + lang_data['L2_Users'])
lang_data.sort_values(['user_diff'], inplace = True)

# Construct all speakers as a share of population
lang_data['Speaker_Share'] = lang_data['All_Users']/ lang_data['Sum_Of_Populations']
# Drop if no record of speakers of a language
lang_data.dropna(subset=['Speaker_Share'], inplace=True)

# Replace cases (fewer than 30) in which speaker share > 1 with 1 (measurement error from Ethnologue)
pop_problems = lang_data.loc[lang_data['Speaker_Share']>1,:]
lang_data.loc[lang_data['Speaker_Share']>1,'Speaker_Share'] = 1

# Drop if ISO-639 is missing (i.e. no language is listed)
lang_data = lang_data.loc[~lang_data['ISO_639'].isna(),:]

# Expand to bilateral by matching all occurrences of each language
expanded = lang_data.merge(lang_data, on='ISO_639', suffixes=('', '_2'))


# Compute speaker product and sum to cal measure
expanded['csl'] = expanded['Speaker_Share'] * expanded['Speaker_Share_2']
csl = (expanded.groupby(['ISO3', 'ISO3_2'], as_index=False).agg({'csl': 'sum'}))

csl.sort_values(['csl'], inplace = True, ascending=False)

# ---
# 5.2. Fill in zeros
# ---

def add_csl_zeros(data):
    # Create sub panel among countries with L2 info so that zeros can be added to countries with no L2 ties
    sub_panel = [(i, j) for i in l2_countries for j in l2_countries]
    sub_panel = pd.DataFrame(sub_panel, columns = ['ISO3', 'ISO3_2'])
    sub_panel = sub_panel.merge(data, how = 'left', on = ['ISO3', 'ISO3_2'], validate = '1:1')
    sub_panel.fillna(0, inplace = True)

    # Create full panel of all countries and add data to it (does not fill in zeros this time)
    full_panel = [(i, j) for i in raw_countries for j in raw_countries]
    full_panel = pd.DataFrame(full_panel, columns = ['ISO3', 'ISO3_2'])
    full_panel = full_panel.merge(sub_panel, how = 'left', on = ['ISO3', 'ISO3_2'], validate = '1:1')
    return full_panel

csl = add_csl_zeros(csl)
csl.sort_values(['ISO3', 'ISO3_2'], inplace=True)

# Check dimensions of entire panel and panel with non-zero entries
if not csl.shape[0] == num_final_records:
    raise ValueError("Final panel not of expected length.")
csl_no_missing = csl.loc[~csl['csl'].isna(), :]
if not csl_no_missing.shape[0] == l2_record_count:
    raise ValueError("Unexpected number of non-zero entries in panel not of expected length.")

# Save to file
csl.to_csv(os.path.join(save_loc, 'csl_{}.csv'.format(version)), index=False)



# --------
# 6: Create language tree proximity measures
# -------
print("Step 6")

# Load language tree data
lang_tree_raw = pd.read_csv(language_tree_location, encoding='latin',  keep_default_na=False, na_values=[''])

# Check for missing values (there shouldn't be any)
lang_tree_missing = lang_tree_raw.loc[lang_tree_raw.isna().any(axis=1),:]
if lang_tree_missing.shape[0]>0:
    raise ValueError("Language tree data is missing values")


# The language tree data is structered as in the following example (actual values have been replaced with fake values
# to avoid disclosure):

# Language_Code	  ISO_639	            Family	                                               Classification
#             1	      aaa	           Cariban	                              Cariban, South Amazonian, Arara
#             2	      aab	  Trans-New Guinea	                        Trans-New Guinea, Madang, Kalam-Kobon
#             3	      aac	      Sino-Tibetan	 Sino-Tibetan, Tibeto-Burman, Ngwi-Burmese, Burmish, Northern
#             4	      aad	    Austro-Asiatic	                  Austro-Asiatic, Mon-Khmer, Viet-Muong, Chut


pd.set_option('display.max_columns', 500)
lang_tree_raw.head()

# Set the maximum number of language levels, which is 14 according to the documentation and data
max_class_length = 14

#---------
# 6a: Split classification into multiple columns
#---------
lang_wide = lang_tree_raw.copy()
# Create column names
level_names = ['level_'+str(num+1) for num in range(max_class_length)]
# Create columns and fill with NaN
for col_name in level_names:
    lang_wide[col_name] = np.NaN
    lang_wide[col_name] = lang_wide[col_name].astype('str')

# Fill columns for each row
for row in lang_wide.index:
    # select and split classification into a list of individual families
    classification = lang_wide.loc[row, 'Classification']
    classification = classification.split(",")

    #Place each family into the appropriate colun
    for position in range(len(classification)):
        lang_wide.loc[row, level_names[position]] = classification[position]

family_list = lang_wide['Family'].unique().tolist()

#----------
# 6b: Determine Family Tree Proximity
#----------

# To split the problem into more manageable chunks, calculate 100 languages at a time and store each group in a
# intermediate file to be combined afterwards
lang_list = lang_tree_raw.index.tolist()
num_langs = len(lang_list)
# Divide languages into sublists of 100 items
lang_subsets = [lang_list[i * 100:(i + 1) * 100] for i in range((len(lang_list) + 100 - 1) // 100)]
num_subsets = len(lang_subsets)

# Create a folder to store the sub-process data files created for the trees
intermediate_file_loc = save_loc+'\\lang_tree_intermediary_data_files'
if not os.path.exists(intermediate_file_loc):
    os.makedirs(intermediate_file_loc)

# This process is slow and may not need to be repeated if already run, "recompute_trees==False" bi-passes this step
if recompute_trees:
    lang_tree = lang_tree_raw.copy()
    # Count is an index to save files under, subset is a collection of 100 languages
    for count, subset in enumerate(lang_subsets):
        # Define file name
        subset_file_name = 'language_family_tree_proximity_{}_{}.csv'.format(version,count)

        # Create the data subset
        print('Calculating language group {} of {}'.format(count+1, num_subsets))

        # Define a list to store each row in the ultimate dataset
        row_list = list()
        # Loop through languages in the subset, which correspond to indexes in the raw data dataframe
        for row_1 in subset:
            # Grab language ID and classification string
            lang_1 = lang_tree.loc[row_1, 'ISO_639']
            class_1 = lang_tree.loc[row_1, 'Classification']
            # split the classificatiuon string into a list of branches
            class_1 = class_1.split(",")
            len_1 = len(class_1)

            # Loop Through all languages as a partner language and derive the same information as lang_1
            for row_2 in lang_tree.index:
                lang_2 = lang_tree.loc[row_2, 'ISO_639']
                class_2 = lang_tree.loc[row_2, 'Classification']
                class_2 = class_2.split(",")
                len_2 = len(class_2)

                # Derive basic characteristics of the two languages
                shortest_tree = min(len_1, len_2)
                longest_tree = max(len_1, len_2)
                ave_tree = (len_1 + len_2)/2

                # Determine the level of match by sequentially comparing each element in both lists of branches
                match_level = 0
                for fam in range(shortest_tree):
                    # If branches match, increment up the match counter, if not, terminate this loop
                    if class_1[fam] == class_2[fam]:
                        match_level += 1
                    else:
                        break

                # Store results in a dictionary and add to the list of all dictionaries
                lp_row = {'lang_1': lang_1,
                          'lang_2': lang_2,
                          'branch_match': match_level,
                          'lang_1_tree_length': len_1,
                          'lang_2_tree_length': len_2,
                          'proximity_score':match_level/ave_tree}

                row_list.append(lp_row)

        # Create a dataframe from the rows and output the subset of data.
        lp1_basic = pd.DataFrame(row_list)
        lp1_basic.to_csv((intermediate_file_loc + '\\' + subset_file_name), index = False)
    else:
        pass


# Compile subset files into a single dataset
lp_files = os.listdir(intermediate_file_loc)
all_tree_data = list()
for file in lp_files:
    data_subset = pd.read_csv(intermediate_file_loc + '\\' + file,  keep_default_na=False, na_values=[''])
    all_tree_data.append(data_subset)

full_tree_data = pd.concat(all_tree_data, axis = 0)
if not full_tree_data.shape[0] == num_langs ** 2:
    raise ValueError('Number of constructed rows is not equal to the number of expected rows (n^2)')


# Create alternative "flat" measure for "branch proximity" that does not take into account total tree length
#   branch split = mean_total_length - common_branches    [increasing in fractionalization]
#   normalized proximity = (max_length - split)/max_length    [decreasing in fractionalization, constrained to [0,1]]
full_tree_data['proximity_score_flat'] = 0.5*(full_tree_data['lang_1_tree_length']+ full_tree_data['lang_2_tree_length']) - full_tree_data['branch_match']
full_tree_data['proximity_score_flat'] = (max_class_length - full_tree_data['proximity_score_flat'])/max_class_length
# Set languages with zero matches (i,e, on different trees) to zero
full_tree_data.loc[full_tree_data['branch_match']==0, 'proximity_score_flat']=0

# Relabel Language identifier columns
full_tree_data.rename(columns = {'lang_1':'ISO_639', 'lang_2':'ISO_639_2'}, inplace = True)

# Create a smaller version with only languages with some level of relationship
tree_proximity = full_tree_data.loc[full_tree_data['proximity_score'] > 0,:].copy()

tree_proximity = tree_proximity.sort_values(['ISO_639', 'ISO_639_2'])
tree_summary = tree_proximity.describe()




# ------
# 7: Compute Bilateral Native Linguistic Proximity measures
# ------
print("Step 7")
# Load fresh copy of data to work with
lang_data = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])

# Create Speaker share
lang_data['Speaker_Share'] = lang_data['L1_Users'] / lang_data['Sum_Of_Populations']

# Drop cases with no speakers
lang_data.dropna(subset=['Speaker_Share'], inplace=True)
# Drop no-longer needed columns
lang_data.drop({'L1_Users', 'Sum_Of_Populations'}, axis=1, inplace=True)

# Create merge key for cartesian merge
lang_data['key_col'] = 1

# Drop several rows where there is no language listed
lang_data = lang_data.loc[~lang_data['ISO_639'].isna(),:]

# Create all country-language pairs
expanded = (lang_data.merge(lang_data, on='key_col', suffixes=('', '_2'))
            .drop('key_col', axis=1)
            .reset_index(drop=True))

if lang_data.shape[0]**2 != expanded.shape[0]:
    raise ValueError("Cartesian Combination not the right shape.")

# Add LP to language pairs
expanded = pd.merge(expanded, tree_proximity, on=['ISO_639', 'ISO_639_2'], how='left')

# DROP SAME LANGUAGE FROM PROXIMITY DATA
expanded = expanded.loc[(expanded['ISO_639'] != expanded['ISO_639_2']),:]


# Fill in cases where there is no proximity
expanded['proximity_score'] = expanded['proximity_score'].fillna(0)
expanded['proximity_score_flat'] = expanded['proximity_score_flat'].fillna(0)


# Compute LP-weighted joint speaker share for (l)inguistic and flat (b)ranch indices
expanded['lpn'] = expanded['Speaker_Share'] * expanded['Speaker_Share_2'] * expanded['proximity_score']
expanded['bpn'] = expanded['Speaker_Share'] * expanded['Speaker_Share_2'] * expanded['proximity_score_flat']

# Compute sum over all language combinations
lp_baseline = (expanded.groupby(['ISO3', 'ISO3_2'], as_index=False).agg({'lpn':'sum', 'bpn':'sum'}))

# Fill in zeros where necessary
def add_lpn_zeros(lp_data):
    lang_countries = lp_data['ISO3'].unique().tolist()
    lang_panel = [(i, j) for i in lang_countries for j in lang_countries]
    lang_panel = pd.DataFrame(lang_panel, columns = ['ISO3', 'ISO3_2'])
    lang_panel = lang_panel.merge(lp_data, how ='outer', on = ['ISO3', 'ISO3_2'], validate ='1:1')
    lang_panel.fillna(0, inplace = True)
    return lang_panel

lp_baseline = add_lpn_zeros(lp_baseline)

lp_baseline.to_csv(os.path.join(save_loc, 'lpn_bpn_{}.csv'.format(version)), index=False)


# ------
# 8: Create acquired linguistic proximity measures (LPA and BPA)
# ------
print("Step 8")
# Load fresh copy of raw data again
lang_data = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])

# Create Speaker share
lang_data['Speaker_Share'] = lang_data['L2_Users'] / lang_data['Sum_Of_Populations']
# Drop cases with no speakers
lang_data.dropna(subset=['Speaker_Share'], inplace=True)
# Drop no-longer needed columns
lang_data.drop({'L2_Users', 'Sum_Of_Populations'}, axis=1, inplace=True)

# Create merge key for cartesian merge
lang_data['key_col'] = 1

# Drop several rows where there is no language listed
lang_data = lang_data.loc[~lang_data['ISO_639'].isna(),:]


# Create cartesian product between every country and language
expanded = (lang_data.merge(lang_data, on='key_col', suffixes=('', '_2'))
            .drop('key_col', axis=1)
            .reset_index(drop=True))

if lang_data.shape[0]**2 != expanded.shape[0]:
    raise ValueError("Cartesian Combination not the right shape.")

# Add LP
expanded = pd.merge(expanded, tree_proximity, on=['ISO_639', 'ISO_639_2'], how='left')

# DROP SAME LANGUAGE FROM PROXIMITY DATA
expanded = expanded.loc[(expanded['ISO_639'] != expanded['ISO_639_2']),:]

# Fill in cases where there is no proximity (the proximity dataframe only includes cases with >0 proximity from same
#     proto-language)
expanded['proximity_score'] = expanded['proximity_score'].fillna(0)
expanded['proximity_score_flat'] = expanded['proximity_score_flat'].fillna(0)

# Compute LP-weighted joint speaker share
expanded['lpa'] = expanded['Speaker_Share'] * expanded['Speaker_Share_2'] * expanded['proximity_score']
expanded['bpa'] = expanded['Speaker_Share'] * expanded['Speaker_Share_2'] * expanded['proximity_score_flat']


# Compute sum over all language combinations
lp_baseline = (expanded.groupby(['ISO3', 'ISO3_2'], as_index=False).agg({'lpa':'sum', 'bpa':'sum'}))


# Fill in zeros and replace zeros with missing for countries without data
def add_lpa_zeros(lp_data):
    # Construct sub-panel for countries with L2 data to include zeros
    lang_panel = [(i, j) for i in l2_countries for j in l2_countries]
    lang_panel = pd.DataFrame(lang_panel, columns = ['ISO3', 'ISO3_2'])
    lang_panel = lang_panel.merge(lp_data, how ='left', on = ['ISO3', 'ISO3_2'], validate ='1:1')
    lang_panel.fillna(0, inplace = True)

    # Add to full panel while keeping non-L2 country entries as missing
    full_panel = [(i, j) for i in raw_countries for j in raw_countries]
    full_panel = pd.DataFrame(full_panel, columns=['ISO3', 'ISO3_2'])
    full_panel = full_panel.merge(lang_panel, how='left', on=['ISO3', 'ISO3_2'], validate='1:1')
    return full_panel

lp_baseline = add_lpa_zeros(lp_baseline)

# Check dimensions of entire panel and panel with non-zero entries
if not lp_baseline.shape[0] == num_final_records:
    raise ValueError("Final panel not of expected length.")
cal_no_missing = lp_baseline.loc[~lp_baseline['lpa'].isna(),:]
if not cal_no_missing.shape[0] == l2_record_count:
    raise ValueError("Unexpected number of non-zero entries in panel not of expected length.")


lp_baseline.sort_values(['ISO3','ISO3_2'], inplace = True)
lp_baseline.to_csv(os.path.join(save_loc, 'lpa_bpa_{}.csv'.format(version)), index=False)




# ---------------------
# 9. Create spoken linguistic proximity measures (LPS and BPS)
# ------------------------
print("Step 9")
# Load Ethnologue Data again
lang_data = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])

# Create Speaker share
lang_data['Speaker_Share'] = lang_data['All_Users'] / lang_data['Sum_Of_Populations']
# Drop cases with no speakers
lang_data.dropna(subset=['Speaker_Share'], inplace=True)
# Drop no-longer needed columns
lang_data.drop({'L2_Users', 'Sum_Of_Populations'}, axis=1, inplace=True)

# Create merge key for cartesian merge
lang_data['key_col'] = 1

# Drop several rows where there is no language listed
lang_data = lang_data.loc[~lang_data['ISO_639'].isna(),:]


# Create cartesian product between every country and language
expanded = (lang_data.merge(lang_data, on='key_col', suffixes=('', '_2'))
            .drop('key_col', axis=1)
            .reset_index(drop=True))

if lang_data.shape[0]**2 != expanded.shape[0]:
    raise ValueError("Cartesian Combination not the right shape.")

# Add LP
expanded = pd.merge(expanded, tree_proximity, on=['ISO_639', 'ISO_639_2'], how='left')

# DROP SAME LANGUAGE FROM PROXIMITY DATA
expanded = expanded.loc[(expanded['ISO_639'] != expanded['ISO_639_2']),:]

# Fill in cases where there is no proximity (the proximity data only includes cases with >0 proximity from same
#     proto-language)
expanded['proximity_score'] = expanded['proximity_score'].fillna(0)
expanded['proximity_score_flat'] = expanded['proximity_score_flat'].fillna(0)

# Compute LP-weighted joint speaker share
expanded['lps'] = expanded['Speaker_Share'] * expanded['Speaker_Share_2'] * expanded['proximity_score']
expanded['bps'] = expanded['Speaker_Share'] * expanded['Speaker_Share_2'] * expanded['proximity_score_flat']


# Compute sum over all language combinations
lp_baseline = (expanded.groupby(['ISO3', 'ISO3_2'], as_index=False).agg({'lps':'sum', 'bps':'sum'}))


# Fill in zeros and replace zeros with missing for countries without data
def add_lps_zeros(lp_data):
    # Construct sub-panel for countries with L2 data to include zeros
    lang_panel = [(i, j) for i in l2_countries for j in l2_countries]
    lang_panel = pd.DataFrame(lang_panel, columns = ['ISO3', 'ISO3_2'])
    lang_panel = lang_panel.merge(lp_data, how ='left', on = ['ISO3', 'ISO3_2'], validate ='1:1')
    lang_panel.fillna(0, inplace = True)

    # Add to full panel while keeping non-L2 country entries as missing
    full_panel = [(i, j) for i in raw_countries for j in raw_countries]
    full_panel = pd.DataFrame(full_panel, columns=['ISO3', 'ISO3_2'])
    full_panel = full_panel.merge(lang_panel, how='left', on=['ISO3', 'ISO3_2'], validate='1:1')
    return full_panel

lp_baseline = add_lps_zeros(lp_baseline)

# Check dimensions of entire panel and panel with non-zero entries
if not lp_baseline.shape[0] == num_final_records:
    raise ValueError("Final panel not of expected length.")
cal_no_missing = lp_baseline.loc[~lp_baseline['lps'].isna(),:]
if not cal_no_missing.shape[0] == l2_record_count:
    raise ValueError("Unexpected number of non-zero entries in panel not of expected length.")


lp_baseline.sort_values(['ISO3','ISO3_2'], inplace = True)
lp_baseline.to_csv(os.path.join(save_loc, 'lps_bps_{}.csv'.format(version)), index=False)


# ---------------------------
# 10: Combine all indices
# ---------------------------
print("Step 10")


indices_loc = ["{}/indices_v2/col_{}.csv".format(working_directory, version),
               "{}/indices_v2/cnl_{}.csv".format(working_directory, version),
               "{}/indices_v2/cal_{}.csv".format(working_directory, version),
               "{}/indices_v2/csl_{}.csv".format(working_directory, version),
               "{}/indices_v2/lpn_bpn_{}.csv".format(working_directory, version),
               "{}/indices_v2/lpa_bpa_{}.csv".format(working_directory, version),
               "{}/indices_v2/lps_bps_{}.csv".format(working_directory, version)]





# ---
# Prep full panel
# ---

# Create full list of countries
raw_lang = pd.read_csv(ethnologue_raw_location, keep_default_na=False, na_values=[''])
lang_countries = raw_lang['ISO3'].unique().tolist()

# Create square panel from list of countries
lang_pairs = [(i, j) for i in lang_countries for j in lang_countries]
panel = pd.DataFrame(lang_pairs, columns = ['ISO3', 'ISO3_2'])
panel.sort_values(by=['ISO3', 'ISO3_2'], inplace = True)


# ---
# Add each set of indices
# ---

for index_file in indices_loc:
    merge_id = "merge_" + index_file[-10:-7]
    index_raw = pd.read_csv(index_file)
    panel = panel.merge(index_raw, on = ['ISO3', 'ISO3_2'], how = 'outer', validate = '1:1',
                        indicator=merge_id)
    right_merge_only = panel.loc[panel[merge_id]=='right_only',:]
    left_merge_only = panel.loc[panel[merge_id]=='left_only',:]
    print("{} right_only: {} rows".format(merge_id, right_merge_only.shape[0]))
    print("{} left_only: {} rows".format(merge_id, left_merge_only.shape[0]))


merge_cols = [col for col in panel.columns if col.startswith('merge')]
panel.drop(merge_cols, axis = 1 , inplace=True)




# ----
# Add country names
# ----

country_names = pd.read_csv(country_names_loc)
panel = panel.merge(country_names, how = 'left', left_on=['ISO3'], right_on=['iso3'], validate='m:1')
panel = panel.merge(country_names, how = 'left', left_on=['ISO3_2'], right_on=['iso3'], validate='m:1')

# relabel some columns
panel.rename(columns={'ISO3':'iso3_i', 'ISO3_2':'iso3_j', 'country_x':'country_i', 'country_y':'country_j'}, inplace=True)
panel.loc[panel['country_i'].isna(),:]

# Reorder columns and drop duplicate iso codes from merge
panel = panel[['iso3_i', 'country_i', 'iso3_j', 'country_j', 'col', 'col_narrow', 'cnl', 'cal', 'csl', 'lpn', 'lpa', 'lps', 'bpn', 'bpa', 'bps']]
panel.sort_values(['iso3_i', 'iso3_j'], inplace = True)

# ---
# Output data
# ---

panel.rename(columns = {'col_narrow':'cor'}, inplace = True)

panel.to_csv("{}DICL_{}.dlm".format(save_loc, version), index=False)

# Prep a Stata version
stata_labels = {'iso3_i': "Country i ISO 3-digit alpha identifier",
                'country_i': 'Country i name',
                'iso3_j': "Country j ISO 3-digit alpha identifier",
                'country_j':'Country j name',
                'col': "Common official language indicator",
                'cor': "Restricted official lang indicator based on narrower definition of official lang",
                'cnl': "Common native language index",
                'cal': "Common acquired language index",
                'csl': "Common spoken language index (native and acquired)",
                'lpn': "Linguistic proximity index for different native languages",
                'lpa': "Linguistic proximity index for different acquired languages",
                'lps': "Linguistic proximity index for different spoken languages (native and acquired)",
                'bpn': "Linguistic branch proximity index for different native languages",
                'bpa': "Linguistic branch proximity index for different acquired languages",
                'bps': "Linguistic branch proximity index for different spoken langs. (nat. and acq.)"
                }
len(stata_labels['cor'])
panel.to_stata("{}DICL_{}.dta".format(save_loc, version), write_index=False,
               variable_labels=stata_labels)


# ----
# Create README
# ----



readme_info = dict()
readme_info['Description'] = \
"""The Domestic and International Common Language (DICL) Dataset contains a collection of country-level and bilateral 
measures of language connections. The 11 DICL indices reflect multiple dimensions of linguistic relationships between 
populations, including common official languages, common spoken languages, and intelligibility between different 
languages. Several measures also differentiate between native languages and acquired languages."""

readme_info['Licensing'] = """CC-BY 4.0 (https://creativecommons.org/licenses/by/4.0/)."""

readme_info['Recommended citation'] = '''Gurevich, T., P.R. Herman, F. Toubal, and Y.V. Yotov. (2024) "The Domestic and International Common 
    Language (DICL) Database." USITC Economics Working Paper 2024-03-A.'''


readme_info['Contents'] = \
"""The DICL database contains 15 columns and 58,564 rows comprised of indices for 29,403 unique country pairs. For 
convenience, the international records are mirrored so that there is a record for both the pair (i, j) and (j, i). 
The domestic records appear once for each country (2 * 29,161 mirrored international records + 242 domestic measures 
= 58,564 total records). The 12 columns contain each of the 8 language measures described in the previous section 
as well as names and ISO 3-digit alpha identifiers for each country. The first row of each column contains a column label."""

technical_description = """
    The official language indicators (col and cor) are defined based on the statuses of official languages in each country.
    col is defined such that col_ij = 1 if country i and j share at least one official language, whether national or 
    provincial, statutory or de facto. Otherwise, col_ij = 0. cor is defined such that cor_ij = 1 if country i and j share 
    at least one national statutory or de facto official language. otherwise, cor_ij = 0. For the domestic indices (i=j), 
    col and cor are set equal to 1 if the country has at least one official language, and is zero otherwise. 
    
    The common language indices (cnl, cal, and csl) are defined based on the population of speakers of each language in each
    country. Let l_ki denote the share of native language speakers of language k in country i. cnl is defined as 
    cnl_ij = sum_k (l_ki x l_kj) for all k. cal is defined as cal_ij = sum_k (a_ki x a_kj) for all k, where a_ki denotes 
    the share of acquired language speakers of language k in country i. Finally, csl is defined as 
    csl_ij = sum_k [(l_ik + a_ki) x (l_jk + a_kj)] for all k. 
    
    The linguistic proximity indices (lpn, lpa, and lps) are defined based on the structure of their respective linguistic 
    trees. Let b_k denote the number of branches in the linguistic family tree for language k. Let b_kh denote the number 
    of branches that are common to two languages k and h. The linguistic proximity between these languages is given by 
    P_kh = b_kh / [0.5 x (b_k + b_h)], reflecting the average proportion of the two language trees that they share. Using 
    the notation defined above, the lpn index is defined as lpn_ij = sum_k sum_{h!=k} (l_ik x l_jh x P_kh) for all k and h. 
    Notably, cases where k=h are excluded so that the measure does not include same-language pairs. The lpa index is 
    similarly defined as  lpa_ij = sum_k sum_{h!=k} (a_ik x a_jh x P_kh)  for all k and h. Finally, the lps index is 
    defined as lps_ij = sum_k sum_{h!=k} [(l_ik + a_ik) x (l_jh + a_jh) x P_kh]  for all k and h. 
    
    The branch proximity indices (bpn, bpa, and bps) are defined using the same linguistic tree information as the 
    linguistic proximity indices. Letting M denote the maximum tree length across all languages (M = max(b_k)), define 
    branch proximity as R_kh = (M - [0.5 x (b_k + b_h) - b_kh]) / M. If two languages are from different language trees 
    and share no common branches, R_kh = 0. Using branch proximity, the bpn index is defined as
    bpn_ij = sum_k sum_{h!=k} (l_ik x l_jh x B_kh). The bpa index is similarly defined as  
    bpa_ij = sum_k sum_{h!=k} (a_ik x a_jh x R_kh)  for all k and h. Finally, the bps index is defined as 
    bps_ij = sum_k sum_{h!=k} [(l_ik + a_ik) x (l_jh + a_jh) x R_kh]  for all k and h. 
"""


readme_file = "{}README_DICL.txt".format(save_loc)
with open(readme_file, "w") as readme:
    readme.write("README for DICL Database\n\n")
    for item, text in readme_info.items():
        readme.write("{}:\n".format(item))
        readme.write("\t{}\n\n".format(text))
    readme.write("Variables:\n")
    for key, label in stata_labels.items():
        readme.write("\t{}:\t{}\n".format(key, label))
    readme.write("\n\nTechnical definitions:")
    readme.write("\t{}\n\n".format(technical_description))


